#!/usr/bin/env Rscript

"> methepic_emseq_wgbs_per-pos_beta.R <

Reads in betas from MethylationEPIC arrays, EM-seq and WGBS, and compares them 
on a per-position basis.

For short read tech (EM-seq/WGBS), coverage cutoffs are defined as having
at least 3 of 4 samples from the same method having a coverage of 5 and
above (i.e., EM-seq 3 of 4 with coverage >= 5 && WGBS 3 of 4 with coverage >= 5).
MethylationEPIC readouts are 'analogue' so no 'coverage' to deal with.
" -> doc

setwd('~/csiro/stopwatch/cpgberus/14_methepic_vs_emseq_wgbs/')

# colour codes are from Dark2 panel
EMSEQ_COLOR <- '#1b9e77'
EPIC_COLOR <- '#e7298a'
WGBS_COLOR <- '#7570b3'

suppressPackageStartupMessages({
  library(Biostrings)
  library(BSgenome.Hsapiens.UCSC.hg38)
  library(cowplot)
  library(ggplot2)
  library(RColorBrewer)
})

# function to pretty-print diagnostic messages
diag_message <- function(...) {
  message('[', format(Sys.time(), "%H:%M:%S"), '] ', ...)
}

# function to annotate line of best fit in scatterplot
lm_eqn <- function(df, x, y) {
  # modified from https://stackoverflow.com/questions/7549694/add-regression-line-equation-and-r2-on-graph
  model <- lm(as.formula(paste(y, '~', x)), df)
  eq <- substitute(
    italic(y) == c + m %.% italic(x)*','~italic(r)^2 == r2*','~italic(p) == pval*','~italic(n) == n_df,
    list(c = format(unname(coef(model)[1]), digits=3),
         m = format(unname(coef(model)[2]), digits=3),
         r2 = format(summary(model)$r.squared, digits=4),
         pval = format(summary(model)$coefficients[2,4], digits=1),
         n_df = format(nrow(df), big.mark=',')))
  as.character(as.expression(eq))
}

# function to calculate sum/len of a binary string
pct_of_ones <- function(binary_string) {
  int_vector <- as.integer(unlist(strsplit(binary_string, split = "")))
  
  sum(int_vector) / length(int_vector) * 100
}

# load processed EPIC data
load('epic_betas_hg38.RData')

# fix EPIC GRanges--all of the current positions refer to the 'C' on the Watson
# strand. this is correct when strand is '+', but not correct when strand is
# '-' (methylated C on Crick would be the G on Watson)
head(epic_gr, 10)
## GRanges object with 10 ranges and 4 metadata columns:
##              seqnames    ranges strand |  WR025V1I  WR025V9I  WR069V1I
##                 <Rle> <IRanges>  <Rle> | <numeric> <numeric> <numeric>
##   cg18478105    chr20  63216298      + | 0.0173053 0.0199615 0.0217486
##   cg09835024     chrX  24054523      + | 0.0326714 0.0517594 0.0346174
##   cg14361672     chr9 128701657      - | 0.8865378 0.8594833 0.8773512
##   cg01763666    chr17  82201630      - | 0.8258608 0.8342132 0.7833352
##   cg12950382    chr14 104710399      - | 0.9221174 0.9384120 0.9030496
##   cg02115394    chr13 114234693      - | 0.0870580 0.0858822 0.0855477
##   cg25813447     chrX  38801258      + | 0.4203698 0.3365862 0.3448609
##   cg07779434     chrX  14873227      + | 0.3743789 0.3703045 0.3686351
##   cg13417420    chr12  12696225      + | 0.0308294 0.0230513 0.0306978
##   cg12480843     chr8  73879050      + | 0.0257384 0.0236206 0.0288161
##               WR069V9I
##              <numeric>
##   cg18478105 0.0168954
##   cg09835024 0.0512708
##   cg14361672 0.8487040
##   cg01763666 0.8522306
##   cg12950382 0.9232085
##   cg02115394 0.0777868
##   cg25813447 0.3797780
##   cg07779434 0.3673453
##   cg13417420 0.0264349
##   cg12480843 0.0262686
##   -------
##   seqinfo: 25 sequences (1 circular) from hg38 genome
getSeq(BSgenome.Hsapiens.UCSC.hg38, head(epic_gr, 10))
## DNAStringSet object of length 10:
##      width seq                                              names               
##  [1]     1 C                                                cg18478105
##  [2]     1 C                                                cg09835024
##  [3]     1 G                                                cg14361672
##  [4]     1 G                                                cg01763666
##  [5]     1 G                                                cg12950382
##  [6]     1 G                                                cg02115394
##  [7]     1 C                                                cg25813447
##  [8]     1 C                                                cg07779434
##  [9]     1 C                                                cg13417420
## [10]     1 C                                                cg12480843
ranges(epic_gr)[strand(epic_gr) == '-'] <- shift(ranges(epic_gr)[strand(epic_gr) == '-'], 1)
head(epic_gr, 10)
## GRanges object with 10 ranges and 4 metadata columns:
##              seqnames    ranges strand |  WR025V1I  WR025V9I  WR069V1I
##                 <Rle> <IRanges>  <Rle> | <numeric> <numeric> <numeric>
##   cg18478105    chr20  63216298      + | 0.0173053 0.0199615 0.0217486
##   cg09835024     chrX  24054523      + | 0.0326714 0.0517594 0.0346174
##   cg14361672     chr9 128701658      - | 0.8865378 0.8594833 0.8773512
##   cg01763666    chr17  82201631      - | 0.8258608 0.8342132 0.7833352
##   cg12950382    chr14 104710400      - | 0.9221174 0.9384120 0.9030496
##   cg02115394    chr13 114234694      - | 0.0870580 0.0858822 0.0855477
##   cg25813447     chrX  38801258      + | 0.4203698 0.3365862 0.3448609
##   cg07779434     chrX  14873227      + | 0.3743789 0.3703045 0.3686351
##   cg13417420    chr12  12696225      + | 0.0308294 0.0230513 0.0306978
##   cg12480843     chr8  73879050      + | 0.0257384 0.0236206 0.0288161
##               WR069V9I
##              <numeric>
##   cg18478105 0.0168954
##   cg09835024 0.0512708
##   cg14361672 0.8487040
##   cg01763666 0.8522306
##   cg12950382 0.9232085
##   cg02115394 0.0777868
##   cg25813447 0.3797780
##   cg07779434 0.3673453
##   cg13417420 0.0264349
##   cg12480843 0.0262686
##   -------
##   seqinfo: 25 sequences (1 circular) from hg38 genome
getSeq(BSgenome.Hsapiens.UCSC.hg38, head(epic_gr, 10))
## DNAStringSet object of length 10:
##      width seq                                              names               
##  [1]     1 C                                                cg18478105
##  [2]     1 C                                                cg09835024
##  [3]     1 C                                                cg14361672
##  [4]     1 C                                                cg01763666
##  [5]     1 C                                                cg12950382
##  [6]     1 C                                                cg02115394
##  [7]     1 C                                                cg25813447
##  [8]     1 C                                                cg07779434
##  [9]     1 C                                                cg13417420
## [10]     1 C                                                cg12480843
# change beta in EPIC GRanges into methylation levels (x 100%)
values(epic_gr) <- as.matrix(values(epic_gr)) * 100
epic_gr
## GRanges object with 864755 ranges and 4 metadata columns:
##              seqnames    ranges strand |  WR025V1I  WR025V9I  WR069V1I
##                 <Rle> <IRanges>  <Rle> | <numeric> <numeric> <numeric>
##   cg18478105    chr20  63216298      + |   1.73053   1.99615   2.17486
##   cg09835024     chrX  24054523      + |   3.26714   5.17594   3.46174
##   cg14361672     chr9 128701658      - |  88.65378  85.94833  87.73512
##   cg01763666    chr17  82201631      - |  82.58608  83.42132  78.33352
##   cg12950382    chr14 104710400      - |  92.21174  93.84120  90.30496
##          ...      ...       ...    ... .       ...       ...       ...
##   cg23079522     chr3 160851840      + |  81.94656  85.42163  85.21876
##   cg16818145     chr3 183064489      + |  81.73503  71.50334  78.85068
##   cg14585103     chr8 138928365      + |  68.51512  62.61529  68.91321
##   cg10633746    chr17  18261129      - |   8.23758   7.12615   8.98915
##   cg12623625     chr1  17620429      - |  54.71393  53.53235  52.03187
##               WR069V9I
##              <numeric>
##   cg18478105   1.68954
##   cg09835024   5.12708
##   cg14361672  84.87040
##   cg01763666  85.22306
##   cg12950382  92.32085
##          ...       ...
##   cg23079522  83.80077
##   cg16818145  72.00106
##   cg14585103  67.36199
##   cg10633746   8.68595
##   cg12623625  54.87539
##   -------
##   seqinfo: 25 sequences (1 circular) from hg38 genome
# load per-position data from EM-seq and WGBS
cov_df <- read.delim('rarefied.coverages.tsv.gz')
beta_df <- read.delim('rarefied.methpcts.tsv.gz')

# filtering `beta_df` for positions with min 5 coverage in ALL 8 rarefied
# samples was too stringent: initial union of all 55m positions (covered at
# least once in at least one sample) drops to < 1m post-filtering.
#
# requiring min 3 of 4 EM-seq samples && min 3 of 4 WGBS samples produces a
# more workable set of positions, ~7.7m.
#
# get numbers that summarises coverages across EM-seq and WGBS samples
diag_message('Number of positions with min 5 coverage in 4/4 EM-seq samples: ',
             sum(rowSums(cov_df[, grepl('ER$', colnames(cov_df))] >= 5) == 4))
## [18:09:41] Number of positions with min 5 coverage in 4/4 EM-seq samples: 9568385
diag_message('Number of positions with min 5 coverage in 4/4 WGBS samples: ',
             sum(rowSums(cov_df[, grepl('WR$', colnames(cov_df))] >= 5) == 4))
## [18:09:50] Number of positions with min 5 coverage in 4/4 WGBS samples: 3274673
diag_message('Number of positions with min 5 coverage in 3/4 EM-seq samples: ',
             sum(rowSums(cov_df[, grepl('ER$', colnames(cov_df))] >= 5) >= 3))
## [18:09:52] Number of positions with min 5 coverage in 3/4 EM-seq samples: 27615442
diag_message('Number of positions with min 5 coverage in 3/4 EM-seq samples: ',
             sum(rowSums(cov_df[, grepl('ER$', colnames(cov_df))] >= 5) >= 3))
## [18:09:55] Number of positions with min 5 coverage in 3/4 EM-seq samples: 27615442
# ... WGBS not performing too hot here, tsk tsk

# note: for these lines to work, positions in `beta_df` and `cov_df` must have 
#       the same order (guaranteed by upstream Python script). else `beta_df` 
#       will not be sliced properly
diag_message('Filter `beta_df` for positions where 3/4 EM-seq samples AND 3/4 WGBS samples have coverage >= 5. ')
## [18:09:57] Filter `beta_df` for positions where 3/4 EM-seq samples AND 3/4 WGBS samples have coverage >= 5.
diag_message('Original nrow: ', nrow(beta_df), '; ')
## [18:09:57] Original nrow: 57672689;
beta_df <- beta_df[rowSums(cov_df[grepl('ER$', colnames(cov_df))] >= 5) >= 3 &
                     rowSums(cov_df[grepl('WR$', colnames(cov_df))] >= 5) >= 3, ]
cov_df <- cov_df[rowSums(cov_df[grepl('ER$', colnames(cov_df))] >= 5) >= 3 &
                   rowSums(cov_df[grepl('WR$', colnames(cov_df))] >= 5) >= 3, ]
diag_message('filtered nrow: ', nrow(beta_df))
## [18:10:13] filtered nrow: 7744836
diag_message('Mean coverage of remaining positions are: ',
             'EM-seq ', mean(as.matrix(cov_df[grepl('ER$', colnames(cov_df))])), '; ',
             'WGBS ', mean(as.matrix(cov_df[grepl('WR$', colnames(cov_df))])))
## [18:10:13] Mean coverage of remaining positions are: EM-seq 7.59187689707051; WGBS 6.70286546416219
# again, WGBS not performing well in terms of coverage

# convert positions from `beta_df` into a GRanges object, so that intersecting
# EPIC data (already in GRanges object) is easy
beta_gr <- GRanges(paste(beta_df$scaffold, beta_df$start_pos, sep=':'))
mcols(beta_gr) <- beta_df[4:ncol(beta_df)]
head(beta_gr)
## GRanges object with 6 ranges and 8 metadata columns:
##       seqnames    ranges strand | WR025V1ER WR025V1WR WR025V9ER WR025V9WR
##          <Rle> <IRanges>  <Rle> | <numeric> <numeric> <numeric> <numeric>
##   [1]     chr1     16244      * |    80.000    77.778    85.714        60
##   [2]     chr1     17407      * |    88.889   100.000   100.000       100
##   [3]     chr1     17452      * |    94.118   100.000   100.000       100
##   [4]     chr1     17453      * |    90.000   100.000   100.000       100
##   [5]     chr1     17478      * |   100.000   100.000   100.000       100
##   [6]     chr1     17479      * |    92.857   100.000    92.308       100
##       WR069V1ER WR069V1WR WR069V9ER WR069V9WR
##       <numeric> <numeric> <numeric> <numeric>
##   [1]    71.429    83.333    50.000    77.778
##   [2]   100.000   100.000   100.000    83.333
##   [3]   100.000   100.000   100.000   100.000
##   [4]   100.000   100.000   100.000   100.000
##   [5]   100.000    92.308   100.000   100.000
##   [6]    87.500   100.000    85.714   100.000
##   -------
##   seqinfo: 25 sequences from an unspecified genome; no seqlengths
# aggressively subset both GRanges to the intersection of both
diag_message('# positions in `epic_gr` before intersection: ', length(epic_gr))
## [18:10:40] # positions in `epic_gr` before intersection: 864755
epic_gr <- subsetByOverlaps(epic_gr, beta_gr, type='equal', ignore.strand=TRUE)
beta_gr <- subsetByOverlaps(beta_gr, epic_gr, type='equal', ignore.strand=TRUE)
epic_gr <- sort(sortSeqlevels(epic_gr), ignore.strand=TRUE)
beta_gr <- sort(sortSeqlevels(beta_gr), ignore.strand=TRUE)
diag_message('# positions in `epic_gr` after intersection: ', length(epic_gr))
## [18:10:42] # positions in `epic_gr` after intersection: 103670
# sanity check after subsetting: lengths are identical, sequence names are in
# same order, as well as start/end values
stopifnot(length(epic_gr) == length(beta_gr))
stopifnot(identical(seqnames(epic_gr), seqnames(beta_gr)))
stopifnot(identical(start(epic_gr), start(beta_gr)))
stopifnot(identical(end(epic_gr), end(beta_gr)))

# this allows for the cbinding of metadata together. store the metadata
# in `epic_gr`, which is better annotated than `beta_gr`
values(epic_gr) <- cbind(values(beta_gr), values(epic_gr))

# transform the metadata into a proper data.frame (not the 'DataFrame' S4 object
# used in GRanges), then calculate per-method mean methylation levels
wide_df <- as.data.frame(values(epic_gr))
wide_df$meanER <- rowMeans(wide_df[, grepl('ER$', colnames(wide_df))], na.rm=TRUE)
wide_df$meanWR <- rowMeans(wide_df[, grepl('WR$', colnames(wide_df))], na.rm=TRUE)
wide_df$meanI <- rowMeans(wide_df[, grepl('I$', colnames(wide_df))])
head(wide_df)
##            WR025V1ER WR025V1WR WR025V9ER WR025V9WR WR069V1ER WR069V1WR
## cg24335620   100.000    85.714   100.000    80.000   100.000   100.000
## cg02288058   100.000    80.000    85.000    85.714     5.882     6.154
## cg11330075    71.429   100.000    80.000    70.000    83.333    60.000
## cg01068023   100.000   100.000   100.000   100.000   100.000   100.000
## cg05440980   100.000   100.000    66.667   100.000    90.000    50.000
## cg25838738   100.000   100.000   100.000    80.000   100.000   100.000
##            WR069V9ER WR069V9WR WR025V1I WR025V9I WR069V1I WR069V9I    meanER
## cg24335620    92.308   100.000 77.68391 72.99938 77.39380 75.01038  98.07700
## cg02288058     3.774     2.174 37.77661 29.32215 36.55422 30.35048  48.66400
## cg11330075    40.000   100.000 16.61267 16.25537 18.78874 16.81365  68.69050
## cg01068023   100.000   100.000 83.65245 83.75699 83.47850 86.62742 100.00000
## cg05440980   100.000   100.000 90.88851 89.95479 89.70510 90.04642  89.16675
## cg25838738   100.000   100.000 87.04974 86.25925 86.39985 88.13814 100.00000
##              meanWR    meanI
## cg24335620  91.4285 75.77187
## cg02288058  43.5105 33.50087
## cg11330075  82.5000 17.11761
## cg01068023 100.0000 84.37884
## cg05440980  87.5000 90.14871
## cg25838738  95.0000 86.96174
# plot a PCA to visualise overall profile of datasets. note that PCA doesn't
# like dealing with NA, necessitating the use of complete.cases() to remove
# any rows with NA in them
set.seed(1337)
pca <- prcomp(t(wide_df[complete.cases(wide_df), ]), scale=TRUE)
pca_coords <- as.data.frame(pca$x)
eigs <- pca$sdev ^ 2

pca_df <- data.frame(PC1=pca_coords$PC1,
                     PC2=pca_coords$PC2,
                     PC3=pca_coords$PC3,
                     row.names=rownames(pca_coords))
pca_df$method <- rownames(pca_coords)
pca_df$method <- gsub('^.*ER$', 'EM-seq', pca_df$method)
pca_df$method <- gsub('^.*WR$', 'WGBS', pca_df$method)
pca_df$method <- gsub('^.*I$', 'EPIC', pca_df$method)
pca_df$sample <- gsub('ER$|WR$|I$', '', rownames(pca_df))

g1 <- ggplot(pca_df, aes(x=PC1, y=PC2, color=method, fill=method, shape=sample)) +
  geom_point(size=3, alpha=0.5) +
  scale_color_manual(values=c(EMSEQ_COLOR, EPIC_COLOR, WGBS_COLOR)) +
  scale_fill_manual(values=c(EMSEQ_COLOR, EPIC_COLOR, WGBS_COLOR)) +
  scale_shape_manual(values=21:25) +
  xlab(paste0('PC1 (', round(eigs[1] / sum(eigs) * 100, 2), '%)')) +
  ylab(paste0('PC2 (', round(eigs[2] / sum(eigs) * 100, 2), '%)')) +
  theme_minimal(12) +
  theme(legend.position='top', legend.box='vertical', legend.margin=margin())
g2 <- ggplot(pca_df, aes(x=PC2, y=PC3, color=method, fill=method, shape=sample)) +
  geom_point(size=3, alpha=0.5) +
  scale_color_manual(values=c(EMSEQ_COLOR, EPIC_COLOR, WGBS_COLOR)) +
  scale_fill_manual(values=c(EMSEQ_COLOR, EPIC_COLOR, WGBS_COLOR)) +
  scale_shape_manual(values=21:25) +
  xlab(paste0('PC2 (', round(eigs[2] / sum(eigs) * 100, 2), '%)')) +
  ylab(paste0('PC3 (', round(eigs[3] / sum(eigs) * 100, 2), '%)')) +
  theme_minimal(12) +
  theme(legend.position='none', legend.box='vertical', legend.margin=margin())
plot_grid(g1, g2, ncol=1, align='v', rel_heights=c(0.55, 0.45))

ggsave('three-way.beta.pca.pdf', width=6, height=6)
# PCA indicates variation is largest across methods (specifically EPIC vs.
# the other two short read methods). There is some variation across samples,
# seen in PC3 (approx PC2 in terms of % variation explained). In PC3, the
# four-sided shapes are on top (WR025) while the three-sided ones are bottom
# (WR069). However, most variation still originate from choice of method,
# hence the calculation of per-method means; subsequent plots are based off
# these mean values
# plot EM-seq vs. WGBS
ggplot(wide_df, aes(x=meanER, y=meanWR)) +
  geom_point(size=0.5, alpha=0.05) + 
  geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df, 'meanER', 'meanWR'), parse=TRUE) +
  ggtitle('Per-position methylation levels, EM-seq vs. WGBS') +
  xlab('EM-seq') +
  ylab('WGBS') +
  theme_minimal(12)

# plot EM-seq vs. EPIC
ggplot(wide_df, aes(x=meanER, y=meanI)) +
  geom_point(size=0.5, alpha=0.05) + 
  geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df, 'meanER', 'meanI'), parse=TRUE) +
  ggtitle('Per-position methylation levels, EM-seq vs. EPIC') +
  xlab('EM-seq') +
  ylab('EPIC') +
  theme_minimal(12)

# WGBS vs. EPIC
ggplot(wide_df, aes(x=meanWR, y=meanI)) +
  geom_point(size=0.5, alpha=0.05) + 
  geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df, 'meanWR', 'meanI'), parse=TRUE) +
  ggtitle('Per-position methylation levels, WGBS vs. EPIC') +
  xlab('WGBS') +
  ylab('EPIC') +
  theme_minimal(12)

# does GC% influence the discrepancies in beta? re-do scatterplot, but colour
# points by GC%
#
# calculate GC% in a window of WINDOW_BP
WINDOW_BP <- 101  # this value should be odd, so that the base at midpoint
                  # is sandwiched by equal num of bases upstream and downstream
bp_per_side <- (WINDOW_BP - 1) / 2

# letterFrequency(CG) / letterfrequency(ACGT) deals with edge cases where windows
# contain N or non-ACGT bases, which get ignored
window_seqs <- getSeq(BSgenome.Hsapiens.UCSC.hg38, epic_gr + bp_per_side)
wide_df$gcpct <- as.vector(
  letterFrequency(window_seqs, 'CG') / letterFrequency(window_seqs, 'ACGT') * 100)
head(wide_df)
##            WR025V1ER WR025V1WR WR025V9ER WR025V9WR WR069V1ER WR069V1WR
## cg24335620   100.000    85.714   100.000    80.000   100.000   100.000
## cg02288058   100.000    80.000    85.000    85.714     5.882     6.154
## cg11330075    71.429   100.000    80.000    70.000    83.333    60.000
## cg01068023   100.000   100.000   100.000   100.000   100.000   100.000
## cg05440980   100.000   100.000    66.667   100.000    90.000    50.000
## cg25838738   100.000   100.000   100.000    80.000   100.000   100.000
##            WR069V9ER WR069V9WR WR025V1I WR025V9I WR069V1I WR069V9I    meanER
## cg24335620    92.308   100.000 77.68391 72.99938 77.39380 75.01038  98.07700
## cg02288058     3.774     2.174 37.77661 29.32215 36.55422 30.35048  48.66400
## cg11330075    40.000   100.000 16.61267 16.25537 18.78874 16.81365  68.69050
## cg01068023   100.000   100.000 83.65245 83.75699 83.47850 86.62742 100.00000
## cg05440980   100.000   100.000 90.88851 89.95479 89.70510 90.04642  89.16675
## cg25838738   100.000   100.000 87.04974 86.25925 86.39985 88.13814 100.00000
##              meanWR    meanI    gcpct
## cg24335620  91.4285 75.77187 64.35644
## cg02288058  43.5105 33.50087 32.67327
## cg11330075  82.5000 17.11761 49.50495
## cg01068023 100.0000 84.37884 49.50495
## cg05440980  87.5000 90.14871 46.53465
## cg25838738  95.0000 86.96174 40.59406
# check distribution of GC% values, to select a colour scheme for the heatmap
# centered appropriately over the median
quantile(wide_df$gcpct, c(0.05, .1, .5, .9, .95))
##       5%      10%      50%      90%      95% 
## 32.67327 34.65347 46.53465 60.39604 63.36634
# hmm. human genome GC% is 42%, but perhaps EPIC probes are preferentially
# chosen around CpG islands, bumping the GC% up slightly. choose a [30, 70]
# colour scale for subsequent plots
# 
# plot EM-seq vs. WGBS and colour points by GC%
g3 <- ggplot(wide_df, aes(x=meanER, y=meanWR, color=gcpct)) +
  geom_point(size=0.5, alpha=0.2) + 
  scale_color_distiller('GC%', palette='RdYlBu', limits=c(30, 70),
                        guide=guide_colorbar(direction='horizontal')) +
  geom_line(stat='smooth', method=lm, formula='y ~ x', alpha=0.3) +
  coord_cartesian(xlim=c(0, 100), ylim=c(0, 100)) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df, 'meanER', 'meanWR'), parse=TRUE) +
  ggtitle('Per-position methylation levels') +
  xlab('EM-seq (%)') +
  ylab('WGBS (%)') +
  theme_minimal(12) +
  theme(legend.position=c(0.75, 0.1))

# plot EM-seq vs. EPIC and colour points by GC%
g4 <- ggplot(wide_df, aes(x=meanER, y=meanI, color=gcpct)) +
  geom_point(size=0.5, alpha=0.2) + 
  scale_color_distiller('GC%', palette='RdYlBu', limits=c(30, 70),
                        guide=guide_colorbar(direction='horizontal')) +
  geom_line(stat='smooth', method=lm, formula='y ~ x', alpha=0.3) +
  coord_cartesian(xlim=c(0, 100), ylim=c(0, 100)) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df, 'meanER', 'meanI'), parse=TRUE) +
  ggtitle('') +
  xlab('EM-seq (%)') +
  ylab('EPIC (%)') +
  theme_minimal() +
  theme(legend.position=c(0.75, 0.1))

# plot WGBS vs. EPIC and colour points by GC%
g5 <- ggplot(wide_df, aes(x=meanWR, y=meanI, color=gcpct)) +
  geom_point(size=0.5, alpha=0.2) + 
  scale_color_distiller('GC%', palette='RdYlBu', limits=c(30, 70),
                        guide=guide_colorbar(direction='horizontal')) +
  geom_line(stat='smooth', method=lm, formula='y ~ x', alpha=0.3) +
  coord_cartesian(xlim=c(0, 100), ylim=c(0, 100)) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df, 'meanWR', 'meanI'), parse=TRUE) +
  ggtitle('') +
  xlab('WGBS (%)') +
  ylab('EPIC (%)') +
  theme_minimal() +
  theme(legend.position=c(0.75, 0.1))

# do, like, proper stats
wide_df$delta_wgbs_emseq <- wide_df$meanWR - wide_df$meanER
summary(lm(wide_df$delta_wgbs_emseq ~ wide_df$gcpct))
## 
## Call:
## lm(formula = wide_df$delta_wgbs_emseq ~ wide_df$gcpct)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -62.561  -4.527   0.354   4.517  53.147 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   -0.3372817  0.1436978  -2.347   0.0189 *
## wide_df$gcpct -0.0003291  0.0029985  -0.110   0.9126  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.231 on 103668 degrees of freedom
## Multiple R-squared:  1.162e-07,  Adjusted R-squared:  -9.53e-06 
## F-statistic: 0.01204 on 1 and 103668 DF,  p-value: 0.9126
g6 <- ggplot(wide_df, aes(x=gcpct, y=delta_wgbs_emseq)) +
  geom_point(size=0.5, alpha=0.05) + 
  geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
  coord_cartesian(xlim=c(20, 80), ylim=c(-50, 50)) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df, 'gcpct', 'delta_wgbs_emseq'), parse=TRUE) +
  ggtitle('Per-position residual methylation levels vs. GC%') +
  xlab('GC%') +
  ylab('Residual WGBS - EM-seq (%)') +
  theme_minimal(12)

wide_df$delta_ont_emseq <- wide_df$meanI - wide_df$meanER
g7 <- ggplot(wide_df, aes(x=gcpct, y=delta_ont_emseq)) +
  geom_point(size=0.5, alpha=0.05) + 
  geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
  coord_cartesian(xlim=c(20, 80), ylim=c(-50, 50)) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df, 'gcpct', 'delta_ont_emseq'), parse=TRUE) +
  ggtitle('') +
  xlab('GC%') +
  ylab('Residual EPIC - EM-seq (%)') +
  theme_minimal(12)

wide_df$delta_ont_wgbs <- wide_df$meanI - wide_df$meanWR
g8 <- ggplot(wide_df, aes(x=gcpct, y=delta_ont_wgbs)) +
  geom_point(size=0.5, alpha=0.05) + 
  geom_smooth(method=lm, formula='y ~ x', alpha=0.5) +
  coord_cartesian(xlim=c(20, 80), ylim=c(-50, 50)) +
  annotate('text', x=-Inf, y=Inf, hjust=0, vjust=1,
           label=lm_eqn(wide_df, 'gcpct', 'delta_ont_wgbs'), parse=TRUE) +
  ggtitle('') +
  xlab('GC%') +
  ylab('Residual EPIC - WGBS (%)') +
  theme_minimal(12)
plot_grid(g3, g6, g4, g7, g5, g8, ncol=2, rel_heights=c(1, 1, 1),
          labels=c('A', 'B', 'C', 'D', 'E', 'F'))

ggsave('three-way.beta_and_gc.pdf', width=10, height=12)

# list deps used in this script
sessionInfo()
## R version 4.1.2 (2021-11-01)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux bookworm/sid
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blis-openmp/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] RColorBrewer_1.1-2                ggplot2_3.3.5                    
##  [3] cowplot_1.1.1                     BSgenome.Hsapiens.UCSC.hg38_1.4.3
##  [5] BSgenome_1.60.0                   rtracklayer_1.52.1               
##  [7] GenomicRanges_1.44.0              Biostrings_2.60.2                
##  [9] GenomeInfoDb_1.28.4               XVector_0.32.0                   
## [11] IRanges_2.26.0                    S4Vectors_0.30.2                 
## [13] BiocGenerics_0.38.0              
## 
## loaded via a namespace (and not attached):
##  [1] lattice_0.20-45             Rsamtools_2.8.0            
##  [3] assertthat_0.2.1            digest_0.6.29              
##  [5] utf8_1.2.2                  R6_2.5.1                   
##  [7] evaluate_0.14               highr_0.9                  
##  [9] pillar_1.6.4                zlibbioc_1.38.0            
## [11] rlang_0.4.12                jquerylib_0.1.4            
## [13] Matrix_1.4-0                rmarkdown_2.11             
## [15] splines_4.1.2               labeling_0.4.2             
## [17] BiocParallel_1.26.2         stringr_1.4.0              
## [19] RCurl_1.98-1.5              munsell_0.5.0              
## [21] DelayedArray_0.18.0         compiler_4.1.2             
## [23] xfun_0.29                   pkgconfig_2.0.3            
## [25] mgcv_1.8-38                 htmltools_0.5.2            
## [27] tidyselect_1.1.1            SummarizedExperiment_1.22.0
## [29] tibble_3.1.6                GenomeInfoDbData_1.2.6     
## [31] matrixStats_0.61.0          XML_3.99-0.8               
## [33] fansi_1.0.2                 withr_2.4.3                
## [35] crayon_1.4.2                dplyr_1.0.7                
## [37] GenomicAlignments_1.28.0    bitops_1.0-7               
## [39] grid_4.1.2                  nlme_3.1-155               
## [41] DBI_1.1.2                   jsonlite_1.7.3             
## [43] gtable_0.3.0                lifecycle_1.0.1            
## [45] magrittr_2.0.1              scales_1.1.1               
## [47] stringi_1.7.6               farver_2.1.0               
## [49] bslib_0.3.1                 ellipsis_0.3.2             
## [51] vctrs_0.3.8                 generics_0.1.1             
## [53] rjson_0.2.21                restfulr_0.0.13            
## [55] tools_4.1.2                 Biobase_2.52.0             
## [57] glue_1.6.0                  purrr_0.3.4                
## [59] MatrixGenerics_1.4.3        fastmap_1.1.0              
## [61] yaml_2.2.1                  colorspace_2.0-2           
## [63] knitr_1.37                  sass_0.4.0                 
## [65] BiocIO_1.2.0